#coding:utf-8
import sys,os
import json
sys.path.append('matchutils')
from algo.similarity import *
from algo.predict import *
from field_format import *
from config import *
from ext_series import gseries
from ext_edition import gedition

def get_keywords( item ):
    """Collect the de-duplicated, lower-cased keyword list for one item.

    Title keywords are derived from the 'name', 'name_ot' and 'name_fr'
    fields; person keywords from 'director_name' and 'actor_name'.

    item: dict-like record; missing or empty name fields are skipped.

    Returns a list of unique keywords. The order is unspecified because the
    list is built from a set.
    """
    names = []
    for field in ('name', 'name_ot', 'name_fr'):
        if field == 'name_ot':
            # Other names are split by the project helper split_othernames.
            names.extend( split_othernames( item.get( field, '' ) ) )
            continue

        val = item.get( field, '' )
        if not val:
            continue
        # Keep the full title as well as each ';'- or '/'-separated part.
        names.append( val )
        names.extend( val.replace(';', '/').split('/') )

    # Build the keywords once, after every name field has been collected.
    # This used to sit inside the loop, so the function returned after the
    # first field that reached it and the remaining fields were ignored.
    keywords = get_name_keywords( names )
    for person_field in ('director_name', 'actor_name'):
        keywords.extend( get_pname_keywords( item.get( person_field, None ) ) )

    return list( set( w.lower().strip() for w in keywords ) )

def match( xitem, yitem, flag = True ):
    """Score whether two item records refer to the same video.

    The raw score comes from gbrt_model.predict() on the features of
    Similarity(yitem, xitem) and is then adjusted by hand-written rules.

    xitem, yitem: dict-like records; both must have a 'name' key and may
        carry 'video_type' (an int or a numeric string).
    flag: when true, the edition check via gedition.compare() is applied.

    Returns the adjusted score: -0.3 when the edition check fails, -1.0 when
    the two names resolve to different series that are not synonyms of each
    other, otherwise the (possibly halved) model score.
    """
    # NOTE: Similarity is constructed with (yitem, xitem), in that order.
    fea = Similarity(yitem, xitem).get_features()
    prob = gbrt_model.predict( fea )
    fea_vt = int( fea.get('video_type', 0 ) )

    if fea_vt == 4 and fea['year_diff'] != 1 and fea['part_ok'] == 0 and prob > 0.5:
        prob /= 2
    elif fea_vt in [1, 2, 4, 16] and prob >= 0.58:
        # True when none of director, actor, year, date, intro or part
        # features support the match.
        unsupported = (
            fea['director_sim'] <= 0.01
            and fea['actor_sim'] <= 0.01
            and fea['year_diff'] <= 0.01
            and fea['date_same'] <= 0.01
            and fea['intro_sim'] is None
            and fea['part_ok'] == 0
        )
        if unsupported and (fea['epi_same'] <= 0.01 or fea_vt == 2):
            prob /= 2.0
        elif (prob < 0.61 and fea['title_sim'] > 0.999
                and fea['director_sim'] > 0.2 and fea['year_diff'] > 0.99
                and fea['date_same'] > 0.99 and fea['part_ok'] == 0
                and fea_vt == 16):
            prob = 0.6

    if fea['title_sim'] < 0.90 and prob > 0.60:
        prob /= 2
    elif prob >= 0.6 and fea['delflag'] >= 0.499:
        prob /= 2

    # Normalise both names to UTF-8 byte strings so the edition and series
    # helpers always receive the same string type. yname must not be re-read
    # from yitem after this point, or the normalisation is lost.
    xname = xitem['name']
    if isinstance(xname, unicode):
        xname = xname.encode('utf-8', 'ignore')
    yname = yitem['name']
    if isinstance(yname, unicode):
        yname = yname.encode('utf-8', 'ignore')

    # Category: prefer xitem's video_type, fall back to yitem's.
    item_vt = xitem.get('video_type', 0 )
    if not item_vt:
        item_vt = yitem.get('video_type', 0 )
    if isinstance(item_vt, unicode):
        item_vt = int( item_vt.encode('utf-8', 'ignore') )
    elif isinstance(item_vt, str):
        item_vt = int( item_vt )

    # Language / edition: a negative comparison rejects the pair.
    if prob > -0.001 and flag and gedition.compare(xname, yname, item_vt ) < 0:
        prob = -0.3

    # Series: two different series only match if they are synonyms.
    if prob >= 0.6:
        s1 = gseries.get_series(xname)
        s2 = gseries.get_series(yname)
        if s1 and s2 and s1 != s2:
            s11 = gseries.get_syn( s1 )
            s22 = gseries.get_syn( s2 )
            if not (s11 == s2 or s1 == s22 or s11 == s22):
                prob = -1.0

    return prob


def run_match():
    """Run match() on one hard-coded pair of records and print both as JSON."""
    xitem = {
        "actor_name": "",
        "name_ot": "",
        "name_fr": "",
        "name": "咱爸咱妈的美好时代",
        "album_id": "",
        "video_type": "4",
        "director_name": "无",
        "episodes": "11",
        "screen_time": "2015-01-01",
        "screen_year": "2015",
        "intro": "",
        "id": "cms_3701834",
        "video_type_flag": "0",
        "category_name": "生活"}
    yitem = {
        "actor_name": "",
        "name_ot": "",
        "name_fr": "",
        "name": "咱爸咱妈的美好时代 2015",
        "album_id": "",
        "video_type": "4",
        "director_name": "BTV生活频道",
        "episodes": "9",
        "screen_time": "",
        "screen_year": "",
        "intro": "简介：《咱爸咱妈的美好时代》是由北京市老龄委、北京电视台生活频道联合推出的一档全新的老年节目。一档服务老年人、受老年人喜爱的老年节目。简介：《咱爸咱妈的美好时代》是由北京市老龄委、北京电视台生活频道联合推出的一档全新的老年节目。一档服务老年人、受老年人喜爱的老年节目。",
        "id": "cms_3053761",
        "video_type_flag": "0",
        "category_name": "生活"}

    # The score is computed and compared, but nothing acts on the result yet.
    if match( xitem, yitem ) < 0.6:
        pass

    for record in (xitem, yitem):
        print(json.dumps(record, ensure_ascii = False ))
# Script entry point: run the sample comparison when executed directly.
if __name__ == '__main__':
    run_match()
